{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 18,
      "metadata": {},
      "outputs": [],
      "source": [
        "from deepeval import evaluate\n",
        "from deepeval.metrics import GEval, FaithfulnessMetric, ContextualRelevancyMetric\n",
        "from deepeval.test_case import LLMTestCase, LLMTestCaseParams"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Test Correctness"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "correctness_metric = GEval(\n",
        "    name=\"Correctness\",\n",
        "    model=\"gpt-4o\",\n",
        "    evaluation_params=[\n",
        "        LLMTestCaseParams.EXPECTED_OUTPUT,\n",
        "        LLMTestCaseParams.ACTUAL_OUTPUT],\n",
        "        evaluation_steps=[\n",
        "        \"Determine whether the actual output is factually correct based on the expected output.\"\n",
        "    ],\n",
        "\n",
        ")\n",
        "\n",
        "gt_answer = \"Madrid is the capital of Spain.\"\n",
        "pred_answer = \"MadriD.\"\n",
        "\n",
        "test_case_correctness = LLMTestCase(\n",
        "    input=\"What is the capital of Spain?\",\n",
        "    expected_output=gt_answer,\n",
        "    actual_output=pred_answer,\n",
        ")\n",
        "\n",
        "correctness_metric.measure(test_case_correctness)\n",
        "print(correctness_metric.score)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Test faithfulness"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "question = \"what is 3+3?\"\n",
        "context = [\"6\"]\n",
        "generated_answer = \"6\"\n",
        "\n",
        "faithfulness_metric = FaithfulnessMetric(\n",
        "    threshold=0.7,\n",
        "    model=\"gpt-4\",\n",
        "    include_reason=False\n",
        ")\n",
        "\n",
        "test_case = LLMTestCase(\n",
        "    input = question,\n",
        "    actual_output=generated_answer,\n",
        "    retrieval_context=context\n",
        "\n",
        ")\n",
        "\n",
        "faithfulness_metric.measure(test_case)\n",
        "print(faithfulness_metric.score)\n",
        "print(faithfulness_metric.reason)\n",
        "\n"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Test contextual relevancy "
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "actual_output = \"then go somewhere else.\"\n",
        "retrieval_context = [\"this is a test context\",\"mike is a cat\",\"if the shoes don't fit, then go somewhere else.\"]\n",
        "gt_answer = \"if the shoes don't fit, then go somewhere else.\"\n",
        "\n",
        "relevance_metric = ContextualRelevancyMetric(\n",
        "    threshold=1,\n",
        "    model=\"gpt-4\",\n",
        "    include_reason=True\n",
        ")\n",
        "relevance_test_case = LLMTestCase(\n",
        "    input=\"What if these shoes don't fit?\",\n",
        "    actual_output=actual_output,\n",
        "    retrieval_context=retrieval_context,\n",
        "    expected_output=gt_answer,\n",
        "\n",
        ")\n",
        "\n",
        "relevance_metric.measure(relevance_test_case)\n",
        "print(relevance_metric.score)\n",
        "print(relevance_metric.reason)"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 34,
      "metadata": {},
      "outputs": [],
      "source": [
        "new_test_case = LLMTestCase(\n",
        "    input=\"What is the capital of Spain?\",\n",
        "    expected_output=\"Madrid is the capital of Spain.\",\n",
        "    actual_output=\"MadriD.\",\n",
        "    retrieval_context=[\"Madrid is the capital of Spain.\"]\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Test two different cases together with several metrics together"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "evaluate(\n",
        "    test_cases=[relevance_test_case, new_test_case],\n",
        "    metrics=[correctness_metric, faithfulness_metric, relevance_metric]\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "### Funcion to create multiple LLMTestCases based on four lists: \n",
        "* Questions\n",
        "* Ground Truth Answers\n",
        "* Generated Answers\n",
        "* Retrieved Documents - Each element is a list"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {},
      "outputs": [],
      "source": [
        "def create_deep_eval_test_cases(questions, gt_answers, generated_answers, retrieved_documents):\n",
        "    return [\n",
        "        LLMTestCase(\n",
        "            input=question,\n",
        "            expected_output=gt_answer,\n",
        "            actual_output=generated_answer,\n",
        "            retrieval_context=retrieved_document\n",
        "        )\n",
        "        for question, gt_answer, generated_answer, retrieved_document in zip(\n",
        "            questions, gt_answers, generated_answers, retrieved_documents\n",
        "        )\n",
        "    ]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "![](https://europe-west1-rag-techniques-views-tracker.cloudfunctions.net/rag-techniques-tracker?notebook=evaluation--evaluation-deep-eval)"
      ]
    }
  ],
  "metadata": {
    "colab": {
      "name": "evaluation_deep_eval.ipynb",
      "private_outputs": true,
      "provenance": []
    },
    "kernelspec": {
      "display_name": ".venv",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.12.0"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}